Society for Epidemiologic Research (SER) Workshop: |
Setup: Part 1/4
In this document, we will provide all steps and R codes required to implement the methods outlined in the slides. This will include the following methods
There will be four html files
- The setup (this file) (1/4)
- Difference-in-difference (DID) and related methods (2/4)
- Synthetic control methods and related methods (3/4)
- Heterogeneous treatment and staggered adoption (4/4)
Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at niaroch@ucla.edu or tbenmarhnia@ucsd.edu.
To reproduce the html documents, you will need to install the following
Once everything is set up, we load the following packages:
if (!require("pacman")){
install.packages("pacman", repos = 'http://cran.us.r-project.org')
} # a nice package to load several packages simultaneously
p_load("tidyverse","magrittr","broom", #for manipulating data
"cleaR", #for clearing workspace
"here", #for directory managment
"Synth", #for the traditional synthetic control method
"gsynth", #for the generalized synthetic control method
"panelView", #for presenting data in a panel
"lme4", #for multi-level analysis
"estimatr", #for robust linear model
"did", #for staggered difference-in-difference
"gtsummary") #for nice tables
This is a simulated data where a policy (e.g. smoking ban) was implemented
- The policy was enacted in five states at the same time: Alabama, Alaska, Arizona, Arkansas and California
- The policy was enacted in 2000
- The unit of analysis is the state
- y is the outcome
- xi is a time-invariant variable (but varies across states)
- xt is a time-varying variable (but is constant across states)
- xit is a time-varying and unit-variable (i.e. varies by year and state)
Let’s load the data
mydata <- read_csv(here("data", "sim_data.csv"))
Create some useful variables within the datasets:
- an indicator for after the policy has been implemented: called this post
- an indicator for states that have implemented the policy: called this treated
- an interaction term between post and treated: called this treatedpost
- create a recentered year variable such that year_rec = 0 at the time of the policy
library(rmarkdown)
paged_table(head(mydata, 10))
glimpse(mydata)
Rows: 2,500
Columns: 11
$ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
$ state_num <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ year <dbl> 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, …
$ xit <dbl> -3.114551, -2.131945, -2.046409, -1.526400, 3.40…
$ xt <dbl> 0.9395244, 1.7698225, 4.0587083, 3.0705084, 3.62…
$ xi <dbl> 7.253319, 7.253319, 7.253319, 7.253319, 7.253319…
$ y <dbl> 7.928443, 13.078756, 17.154577, 19.731460, 36.67…
$ year_rec <dbl> -39, -38, -37, -36, -35, -34, -33, -32, -31, -30…
$ post <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ treated <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ treatedpost <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
p_load("skimr")
skim(mydata)
| Name | mydata |
| Number of rows | 2500 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| state | 0 | 1 | 4 | 14 | 0 | 50 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| state_num | 0 | 1 | 25.50 | 14.43 | 1.00 | 13.00 | 25.50 | 38.00 | 50.00 | ▇▇▇▇▇ |
| year | 0 | 1 | 1985.50 | 14.43 | 1961.00 | 1973.00 | 1985.50 | 1998.00 | 2010.00 | ▇▇▇▇▇ |
| xit | 0 | 1 | 4.98 | 18.86 | -65.71 | -3.39 | 2.67 | 13.94 | 89.51 | ▁▂▇▁▁ |
| xt | 0 | 1 | 13.78 | 7.26 | 0.94 | 7.90 | 12.59 | 20.05 | 26.28 | ▇▇▆▇▆ |
| xi | 0 | 1 | 2.65 | 1.78 | -0.31 | 1.63 | 2.27 | 3.03 | 8.37 | ▂▇▁▁▁ |
| y | 0 | 1 | 73.20 | 71.82 | -107.37 | 20.59 | 57.74 | 111.92 | 391.69 | ▁▇▃▁▁ |
| year_rec | 0 | 1 | -14.50 | 14.43 | -39.00 | -27.00 | -14.50 | -2.00 | 10.00 | ▇▇▇▇▇ |
| post | 0 | 1 | 0.22 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| treated | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| treatedpost | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
mydata <- mydata %>%
mutate(year_rec = as.integer(year_rec)) %>%
as.data.frame() # need to convert to a data.frame for some functions to work
Using PanelView
p_load("panelView")
panelview(y ~ treatedpost, data = mydata,
index = c("state","year_rec"),
pre.post = TRUE)
Using Panel Match
p_load("PanelMatch")
mydata_panel <- PanelData(panel.data = mydata,
unit.id = "state_num",
time.id = "year_rec",
treatment = "treatedpost",
outcome = "y")
DisplayTreatment(panel.data = mydata_panel,
legend.position = "none",
xlab = "year", ylab = "state")
long lat group order region subregion
1 -87.46201 30.38968 1 1 alabama <NA>
2 -87.48493 30.37249 1 2 alabama <NA>
3 -87.52503 30.37249 1 3 alabama <NA>
4 -87.53076 30.33239 1 4 alabama <NA>
5 -87.57087 30.32665 1 5 alabama <NA>
6 -87.58806 30.32665 1 6 alabama <NA>
mydata2014 <- mydata %>%
filter(year==2000) %>%
mutate(region = str_to_lower(state))
mydata_maps <- left_join(mydata2014, us_states, by = "region")
p <- ggplot(data = mydata_maps,
aes(x = long, y = lat,
group = group, fill = factor(treated))) +
labs(title = "Smoking bans in the US in 2000", fill = NULL) +
geom_polygon(color = "gray90", size = 0.1) +
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
theme_map() +
guides(fill = guide_legend(nrow = 3)) +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom")
p
p_load("gtsummary")
n_treated0 <- mydata %>%
filter(treated == 0) %>%
select(state) %>%
n_distinct()
n_treated1 <- mydata %>%
filter(treated == 1) %>%
select(state) %>%
n_distinct()
tab1 <- mydata %>%
filter(post==0) %>%
select(c("xit","xt", "xi", "treated","y")) %>%
mutate(treated= case_when(treated==1~"Treated",
TRUE~"Control")) %>%
tbl_summary(
missing = "no",
by ="treated",
type = list(everything() ~ "continuous"),
digits = list(everything() ~ 2),
statistic = list(everything()~"{mean}")
) %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "Table 1") %>%
modify_header(label = '**Characteristic**',
stat_1 = '**Control**, N = {n_treated0}',
stat_2 = '**Treated**, N = {n_treated1}'
)
tab1
| Characteristic |
Table 1
|
|
|---|---|---|
| Control, N = 451 | Treated, N = 51 | |
| xit | 3.60 | 6.78 |
| xt | 11.06 | 11.06 |
| xi | 2.13 | 7.26 |
| y | 54.90 | 74.95 |
| 1 Mean | ||
Balance by treated group
p1 <- mydata %>%
filter(post==0) %>%
select(xt, xi, xit, y, treated) %>%
mutate(treated= case_when(treated==1~"Treated",
TRUE~"Control")) %>%
group_by(treated) %>%
group_modify(~ {.x %>% map_dfr(mean)}) %>%
pivot_longer(cols = -treated,
names_to = c("variable"),
values_to = "mean") %>%
ggplot(aes(x=treated, y=mean, fill=factor(treated))) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label = round(mean,2)),
position = position_stack(0.5),
size=3,
color = "black")+
facet_wrap(~variable, scales = "free_y") +
labs(title = "Checking for imbalance in variables pre-policy",
y = "Mean",
x = "Variables",
fill = "Treatment status")+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
p1
Notice xt is constant across units
Balance by post period
p2 <- mydata %>%
filter(treated==0) %>%
select(xt, xi, xit, y, post) %>%
mutate(post= case_when(post==1~"After",
TRUE~"Before")) %>%
group_by(post) %>%
group_modify(~ {.x %>% map_dfr(mean)}) %>%
pivot_longer(cols = -post,
names_to = c("variable"),
values_to = "mean") %>%
ggplot(aes(x=post, y=mean, fill=factor(post))) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label = round(mean,2)),
position = position_stack(0.5),
size=3,
color = "black")+
facet_wrap(~variable, scales = "free_y") +
labs(title = "Checking for imbalance in variables in control units",
y = "Mean",
x = "Variables",
fill = "Time")+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
p2
Notice xi is constant across time
For all the data
mydata %>%
ggplot(aes(x=year, y=y, group=state)) +
annotate("rect", fill = "gray", alpha = 0.5,
xmin = 2000, xmax = 2010,
ymin = -Inf, ymax = Inf) +
labs(title = paste("Outcome by year"),
x = "Year",
y = "Outcome",
color = "Treatment") +
geom_line(aes(color=factor(treated)), size=0.5) +
scale_color_discrete(labels=c("Control", "Treated")) +
geom_vline(xintercept = year_policy, lty=2) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
For two states
mydata %>%
filter(state %in% c("California", "Georgia")) %>%
ggplot(aes(x=year, y=y, group=state)) +
annotate("rect", fill = "gray", alpha = 0.5,
xmin = 2000, xmax = 2010,
ymin = -Inf, ymax = Inf) +
labs(title = paste("Outcome by year"),
x = "Year",
y = "Outcome",
color = "Treatment") +
geom_line(aes(color=factor(treated)), size=0.5) +
scale_color_discrete(labels=c("Control", "Treated")) +
geom_vline(xintercept = year_policy, lty=2) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
On average
mydata %>%
group_by(year, treated) %>%
summarise(y=mean(y),.groups="keep") %>%
ggplot(aes(x=year, y=y, group=treated, color = factor(treated))) +
annotate("rect", fill = "gray", alpha = 0.5,
xmin = 2000, xmax = 2010,
ymin = -Inf, ymax = Inf) +
labs(title = paste("Outcome by year"),
x = "Year",
y = "Outcome",
colour = "Treatment") +
geom_line() +
scale_color_discrete(labels=c("Controls", "Treated")) +
geom_vline(xintercept = year_policy, lty=2) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))